{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
     ]
    }
   ],
   "source": [
    "from transformers import GPT2Tokenizer\n",
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    T5ForConditionalGeneration,\n",
    "    AutoConfig,\n",
    ")\n",
    "from tokenizers import AddedToken\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained('malaysia-ai/bpe-tokenizer')\n",
    "tokenizer_t5 = AutoTokenizer.from_pretrained('google/t5-v1_1-base')\n",
    "additional = []\n",
    "for t in tokenizer_t5.additional_special_tokens:\n",
    "    additional.append(AddedToken(t))\n",
    "tokenizer.add_special_tokens({\"additional_special_tokens\": additional})\n",
    "tokenizer.pad_token = '<s>'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('./out-small-1.1/tokenizer_config.json',\n",
       " './out-small-1.1/special_tokens_map.json',\n",
       " './out-small-1.1/tokenizer.json')"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.save_pretrained('./out-small-1.1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = AutoConfig.from_pretrained(\n",
    "    'google/t5-v1_1-base'\n",
    ")\n",
    "config.dropout_rate = 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "config.num_heads = 8\n",
    "config.num_layers = 6\n",
    "config.d_model = 512\n",
    "config.num_decoder_layers = 6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = T5ForConditionalGeneration(\n",
    "    config,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_pretrained('./out-small-1.1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 343M\r\n",
      "-rw-r--r-- 1 ubuntu ubuntu  789 Apr 14 02:28 config.json\r\n",
      "-rw-r--r-- 1 ubuntu ubuntu  142 Apr 14 02:28 generation_config.json\r\n",
      "-rw-r--r-- 1 ubuntu ubuntu 342M Apr 14 02:28 model.safetensors\r\n",
      "-rw-r--r-- 1 ubuntu ubuntu  15K Apr 14 02:28 special_tokens_map.json\r\n",
      "-rw-r--r-- 1 ubuntu ubuntu 1.3M Apr 14 02:28 tokenizer.json\r\n",
      "-rw-r--r-- 1 ubuntu ubuntu  21K Apr 14 02:28 tokenizer_config.json\r\n"
     ]
    }
   ],
   "source": [
    "!ls -lh out-small-1.1"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
